{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这次实验依然是用的Criteo数据集， 只不过由于原来的数据量太大， 为了在单机上能够运行， 做了采样， 取了很少的一部分进行实验。数据集位于data/文件夹下， train.csv是训练集， test.csv是测试集。 这个笔记本我们是做数据的读入和预处理操作， 具体步骤如下：\n",
    "1. 读入数据集， 并进行缺失值的填充， 这里为了简单一些， 直接类别特征填充“-1”， 数值特征填充0\n",
    "2. 类别特征的编码， 用的LabelEncoder编码， 数值特征的归一化处理\n",
    "3. 划分开训练集和验证集保存到preprocessed_data/文件夹下"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入包和数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-28T11:41:53.322233Z",
     "start_time": "2020-12-28T11:41:51.051140Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"导入包\"\"\"\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler, LabelEncoder\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-28T11:43:10.340452Z",
     "start_time": "2020-12-28T11:43:10.331433Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"Feature processing\"\"\"\n",
    "def sparseFeature(feat, feat_num, embed_dim=4):\n",
    "    \"\"\"\n",
    "    Build the descriptor dictionary of one sparse feature.\n",
    "\n",
    "    :param feat: feature name\n",
    "    :param feat_num: number of distinct values of the sparse feature\n",
    "    :param embed_dim: embedding dimension\n",
    "    :return: dict with the keys 'feat', 'feat_num' and 'embed_dim'\n",
    "    \"\"\"\n",
    "    return dict(feat=feat, feat_num=feat_num, embed_dim=embed_dim)\n",
    "\n",
    "def denseFeature(feat):\n",
    "    \"\"\"\n",
    "    Build the descriptor dictionary of one dense feature.\n",
    "\n",
    "    :param feat: dense feature name\n",
    "    :return: dict with the single key 'feat'\n",
    "    \"\"\"\n",
    "    return dict(feat=feat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-28T11:44:32.830715Z",
     "start_time": "2020-12-28T11:44:32.810769Z"
    }
   },
   "outputs": [],
   "source": [
    "# Read the dataset and preprocess it\n",
    "def create_cretio_data(embed_dim=8, test_size=0.2, data_dir='./data', save_dir='preprocessed_data'):\n",
    "    \"\"\"\n",
    "    Read train.csv / test.csv, preprocess them and save the results.\n",
    "\n",
    "    Missing values are filled ('-1' for the 'C*' columns, 0 for the 'I*' columns),\n",
    "    the 'C*' columns are label-encoded and the 'I*' columns are min-max scaled.\n",
    "    Encoders and scaler are fitted on the concatenation of train and test rows.\n",
    "    The labelled rows are then split into a training and a validation set.\n",
    "\n",
    "    Files written to save_dir: fea_col.npy, train_set.csv, val_set.csv, test_set.csv.\n",
    "\n",
    "    :param embed_dim: embedding dimension stored in every sparse feature descriptor\n",
    "    :param test_size: fraction of the labelled rows held out as the validation set\n",
    "    :param data_dir: directory that contains train.csv and test.csv\n",
    "    :param save_dir: output directory; created when it does not exist\n",
    "    :return: None\n",
    "    \"\"\"\n",
    "    import os\n",
    "\n",
    "    # Make sure the output directory exists before anything is written to it\n",
    "    os.makedirs(save_dir, exist_ok=True)\n",
    "\n",
    "    # import data\n",
    "    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))\n",
    "    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))\n",
    "    n_train = train_df.shape[0]\n",
    "\n",
    "    # Merge train and test rows; the label is set aside and re-attached later\n",
    "    label = train_df['Label']\n",
    "    data_df = pd.concat((train_df.drop(columns=['Label']), test_df), ignore_index=True)\n",
    "    # 'Id' must be dropped before the prefix test below, because it also starts with 'I'\n",
    "    data_df = data_df.drop(columns=['Id'])\n",
    "\n",
    "    print(data_df.columns)\n",
    "    # Separate the columns by type using their name prefix\n",
    "    sparse_feas = [col for col in data_df.columns if col[0] == 'C']\n",
    "    dense_feas = [col for col in data_df.columns if col[0] == 'I']\n",
    "\n",
    "    # Fill missing values\n",
    "    data_df[sparse_feas] = data_df[sparse_feas].fillna('-1')\n",
    "    data_df[dense_feas] = data_df[dense_feas].fillna(0)\n",
    "\n",
    "    # Save the feature descriptors: [dense descriptors, sparse descriptors]\n",
    "    feature_columns = [\n",
    "        [denseFeature(feat) for feat in dense_feas],\n",
    "        [sparseFeature(feat, data_df[feat].nunique(), embed_dim=embed_dim) for feat in sparse_feas],\n",
    "    ]\n",
    "    # The two inner lists usually differ in length; recent NumPy only builds such a\n",
    "    # ragged array when dtype=object is requested explicitly.\n",
    "    # NOTE(review): an object array is pickled by np.save, so readers need allow_pickle=True.\n",
    "    np.save(os.path.join(save_dir, 'fea_col.npy'), np.array(feature_columns, dtype=object))\n",
    "\n",
    "    # Label-encode the categorical features\n",
    "    for feat in sparse_feas:\n",
    "        data_df[feat] = LabelEncoder().fit_transform(data_df[feat])\n",
    "\n",
    "    # Scale the numeric features to [0, 1]\n",
    "    data_df[dense_feas] = MinMaxScaler().fit_transform(data_df[dense_feas])\n",
    "\n",
    "    # Split back into train and test rows by position.\n",
    "    # .copy() gives an independent frame, so adding 'Label' no longer triggers\n",
    "    # SettingWithCopyWarning; .values attaches the labels by position.\n",
    "    train = data_df.iloc[:n_train].copy()\n",
    "    test = data_df.iloc[n_train:]\n",
    "    train['Label'] = label.values\n",
    "\n",
    "    # Hold out a validation set; honour the caller's test_size\n",
    "    train_set, val_set = train_test_split(train, test_size=test_size, random_state=2020)\n",
    "\n",
    "    # Save the files without the index column\n",
    "    train_set.to_csv(os.path.join(save_dir, 'train_set.csv'), index=False)\n",
    "    val_set.to_csv(os.path.join(save_dir, 'val_set.csv'), index=False)\n",
    "    test.to_csv(os.path.join(save_dir, 'test_set.csv'), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-28T11:44:35.303546Z",
     "start_time": "2020-12-28T11:44:35.142999Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11',\n",
      "       'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',\n",
      "       'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19',\n",
      "       'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\ZhongqiangWu\\Anaconda3\\envs\\pytorch_gpu\\lib\\site-packages\\ipykernel_launcher.py:42: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    }
   ],
   "source": [
    "# Run the preprocessing pipeline with its default arguments; it writes\n",
    "# fea_col.npy and the train/val/test CSV files under preprocessed_data/.\n",
    "create_cretio_data()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
